Inlcuding Continents in Dataset:
data <- read.csv("group_3_data.csv")
continents <- read.csv("Countries by continents.csv")
continents <- continents %>% rename(reference_area = Country)
data <- merge(data, continents, by = "reference_area")
Statistics by continent (mean, median, sd, max, min):
datacont <- data[,-1] %>% group_by(Continent)
variables <- c("education", "employment", "absence_of_legal_discrimination", "access_to_productive_and_financial_assets", "reproductive_discrim", "restricted_civil_liberties", "discrimination_in_the_family")
statstable_bycont <- datacont %>% group_by(Continent) %>%
summarise(across(all_of(variables), list(mean = mean,
median = median,
sd = sd,
max = max,
min = min)))
statstable_bycont <- data.frame(statstable_bycont)
statstable_bycont
## Continent education_mean education_median education_sd education_max
## 1 Africa 6.219914 6.144916 2.28506222 9.65739
## 2 Asia 9.200349 9.232720 2.60907629 13.30556
## 3 Europe 12.140683 12.224860 1.13651010 13.84053
## 4 North America 9.383249 9.096670 3.86157270 13.91498
## 5 Oceania 12.873235 12.873235 0.08859335 12.93588
## 6 South America 9.517324 9.234145 1.03437457 11.35365
## education_min employment_mean employment_median employment_sd employment_max
## 1 2.161845 55.00311 58.90759 19.512818 81.04615
## 2 3.896490 49.56838 54.04757 20.638198 76.96047
## 3 9.778190 69.68894 71.93355 9.169670 79.98667
## 4 4.552000 58.49051 57.45274 10.928704 75.06139
## 5 12.810590 76.73362 76.73362 2.103119 78.22075
## 6 8.333614 63.29520 62.82961 8.258768 75.80638
## employment_min absence_of_legal_discrimination_mean
## 1 25.60620 77.50000
## 2 14.21400 70.91346
## 3 46.26808 93.48558
## 4 43.04505 82.32143
## 5 75.24649 97.18750
## 6 53.31744 86.32812
## absence_of_legal_discrimination_median absence_of_legal_discrimination_sd
## 1 76.8750 8.3560887
## 2 75.6250 13.3258818
## 3 93.7500 5.8018668
## 4 86.2500 13.0389755
## 5 97.1875 0.4419417
## 6 86.8750 5.2124240
## absence_of_legal_discrimination_max absence_of_legal_discrimination_min
## 1 88.125 64.375
## 2 90.625 46.875
## 3 100.000 78.750
## 4 100.000 61.250
## 5 97.500 96.875
## 6 95.000 79.375
## access_to_productive_and_financial_assets_mean
## 1 30.10000
## 2 33.22308
## 3 11.64231
## 4 20.94286
## 5 18.85000
## 6 26.52500
## access_to_productive_and_financial_assets_median
## 1 35.10
## 2 38.80
## 3 7.00
## 4 18.60
## 5 18.85
## 6 20.65
## access_to_productive_and_financial_assets_sd
## 1 10.734990
## 2 13.638191
## 3 9.945539
## 4 7.351611
## 5 2.050610
## 6 16.262380
## access_to_productive_and_financial_assets_max
## 1 42.6
## 2 50.4
## 3 39.6
## 4 34.0
## 5 20.3
## 6 60.6
## access_to_productive_and_financial_assets_min reproductive_discrim_mean
## 1 12.0 47.222222
## 2 3.5 53.846154
## 3 0.8 1.923077
## 4 15.1 75.000000
## 5 17.4 0.000000
## 6 8.6 43.750000
## reproductive_discrim_median reproductive_discrim_sd reproductive_discrim_max
## 1 50.0 29.166667 75
## 2 75.0 37.977726 100
## 3 0.0 9.805807 50
## 4 75.0 35.355339 100
## 5 0.0 0.000000 0
## 6 62.5 37.201190 75
## reproductive_discrim_min restricted_civil_liberties_mean
## 1 0 26.43333
## 2 0 35.96923
## 3 0 17.31538
## 4 0 24.94286
## 5 0 19.70000
## 6 0 10.40000
## restricted_civil_liberties_median restricted_civil_liberties_sd
## 1 25.50 19.329964
## 2 36.50 16.529891
## 3 17.75 5.656620
## 4 20.50 17.755267
## 5 19.70 3.959798
## 6 8.05 7.012438
## restricted_civil_liberties_max restricted_civil_liberties_min
## 1 52.3 2.1
## 2 55.4 11.5
## 3 27.5 3.1
## 4 61.7 5.9
## 5 22.5 16.9
## 6 26.9 4.5
## discrimination_in_the_family_mean discrimination_in_the_family_median
## 1 49.78889 44.3
## 2 56.70000 56.9
## 3 14.90769 8.3
## 4 26.91429 29.2
## 5 7.90000 7.9
## 6 21.72500 21.7
## discrimination_in_the_family_sd discrimination_in_the_family_max
## 1 20.16541 80.2
## 2 26.91694 89.1
## 3 13.04178 42.3
## 4 15.02114 49.8
## 5 0.00000 7.9
## 6 10.03106 36.7
## discrimination_in_the_family_min
## 1 18.8
## 2 14.6
## 3 0.0
## 4 10.6
## 5 7.9
## 6 8.5
Statistics overall (mean, median, sd, max, min):
statstable <- data.frame(Mean = sapply(data[variables], mean),
Median = sapply(data[variables], median),
`Standard Deviation` = sapply(data[variables], sd),
Maximum = sapply(data[variables], max),
Minimum = sapply(data[variables], min))
statstable
## Mean Median Standard.Deviation
## education 10.13553 10.88014 2.873973
## employment 61.85526 64.48421 15.800035
## absence_of_legal_discrimination 84.78846 87.50000 12.536216
## access_to_productive_and_financial_assets 21.56923 20.30000 14.253935
## reproductive_discrim 31.53846 0.00000 37.831071
## restricted_civil_liberties 22.35231 18.80000 14.653958
## discrimination_in_the_family 30.01231 21.90000 24.546903
## Maximum Minimum
## education 13.91498 2.161845
## employment 81.04615 14.213997
## absence_of_legal_discrimination 100.00000 46.875000
## access_to_productive_and_financial_assets 60.60000 0.800000
## reproductive_discrim 100.00000 0.000000
## restricted_civil_liberties 61.70000 2.100000
## discrimination_in_the_family 89.10000 0.000000
Means of just education and employment by continent (Q1+3):
minitable <- datacont %>% group_by(Continent) %>% summarize(`Mean Education` = mean(education),`Mean Employment` = mean(employment))
minitable
## # A tibble: 6 × 3
## Continent `Mean Education` `Mean Employment`
## <chr> <dbl> <dbl>
## 1 Africa 6.22 55.0
## 2 Asia 9.20 49.6
## 3 Europe 12.1 69.7
## 4 North America 9.38 58.5
## 5 Oceania 12.9 76.7
## 6 South America 9.52 63.3
Means of just education and access to financials by continent (Q2):
minitable2 <- datacont %>% group_by(Continent) %>% summarize(`Mean Education` = mean(education),`Mean Access` = mean(access_to_productive_and_financial_assets))
minitable2
## # A tibble: 6 × 3
## Continent `Mean Education` `Mean Access`
## <chr> <dbl> <dbl>
## 1 Africa 6.22 30.1
## 2 Asia 9.20 33.2
## 3 Europe 12.1 11.6
## 4 North America 9.38 20.9
## 5 Oceania 12.9 18.8
## 6 South America 9.52 26.5
Correlation Matrix:
cormatrix <- as.data.frame(cor(data[,variables]))
cormatrix
## education employment
## education 1.0000000 0.5649636
## employment 0.5649636 1.0000000
## absence_of_legal_discrimination 0.6404098 0.7090588
## access_to_productive_and_financial_assets -0.5384356 -0.5388207
## reproductive_discrim -0.5460163 -0.3990825
## restricted_civil_liberties -0.3184921 -0.3202537
## discrimination_in_the_family -0.6249280 -0.7398914
## absence_of_legal_discrimination
## education 0.6404098
## employment 0.7090588
## absence_of_legal_discrimination 1.0000000
## access_to_productive_and_financial_assets -0.7074188
## reproductive_discrim -0.5808030
## restricted_civil_liberties -0.6622231
## discrimination_in_the_family -0.8086819
## access_to_productive_and_financial_assets
## education -0.5384356
## employment -0.5388207
## absence_of_legal_discrimination -0.7074188
## access_to_productive_and_financial_assets 1.0000000
## reproductive_discrim 0.4734101
## restricted_civil_liberties 0.4340695
## discrimination_in_the_family 0.5854146
## reproductive_discrim
## education -0.5460163
## employment -0.3990825
## absence_of_legal_discrimination -0.5808030
## access_to_productive_and_financial_assets 0.4734101
## reproductive_discrim 1.0000000
## restricted_civil_liberties 0.3043340
## discrimination_in_the_family 0.4776794
## restricted_civil_liberties
## education -0.3184921
## employment -0.3202537
## absence_of_legal_discrimination -0.6622231
## access_to_productive_and_financial_assets 0.4340695
## reproductive_discrim 0.3043340
## restricted_civil_liberties 1.0000000
## discrimination_in_the_family 0.6327375
## discrimination_in_the_family
## education -0.6249280
## employment -0.7398914
## absence_of_legal_discrimination -0.8086819
## access_to_productive_and_financial_assets 0.5854146
## reproductive_discrim 0.4776794
## restricted_civil_liberties 0.6327375
## discrimination_in_the_family 1.0000000
VIF values:
reg_for_vif <- lm(employment ~ education + absence_of_legal_discrimination + access_to_productive_and_financial_assets + reproductive_discrim + restricted_civil_liberties + discrimination_in_the_family, data)
vif(reg_for_vif)
## education
## 2.081537
## absence_of_legal_discrimination
## 4.880672
## access_to_productive_and_financial_assets
## 2.064374
## reproductive_discrim
## 1.656528
## restricted_civil_liberties
## 2.008626
## discrimination_in_the_family
## 3.318365
Seperate Boxplots:
ggplot(data, aes(x = Continent, y = education)) + geom_boxplot() + labs(title = "Women's Education by Continent", y = "Education")
ggplot(data, aes(x = Continent, y = employment)) + geom_boxplot() + labs(title = "Women's Employment by Continent", y = "Employment")
Combined Boxplots:
ggplot(data) +
geom_boxplot(aes(x = Continent, y = education, fill = "education")) +
geom_boxplot(aes(x = Continent, y = employment, fill = "employment")) +
labs(title = "Boxplots of Education and Employment", y = "Education and Employment", x = "Continent")
slr_employment_education <- lm(employment ~ education, data = data)
ggplot(data, aes(x = education, y = employment)) + geom_point() + geom_smooth(method = "lm", se = FALSE) + labs(title = "Education vs Employment")
## `geom_smooth()` using formula = 'y ~ x'
e <- resid(slr_employment_education)
qqnorm(e)
qqline(e, col = "blue")
plot(fitted(slr_employment_education), e)
abline(h = 0, col = "blue")
summary(slr_employment_education)
##
## Call:
## lm(formula = employment ~ education, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -47.443 -6.298 2.328 7.668 26.507
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 30.3747 6.0174 5.048 4.07e-06 ***
## education 3.1060 0.5715 5.435 9.46e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.14 on 63 degrees of freedom
## Multiple R-squared: 0.3192, Adjusted R-squared: 0.3084
## F-statistic: 29.54 on 1 and 63 DF, p-value: 9.456e-07
The Q-Q plot shows the residuals are normally distributed with some extremes at the tails. There is no visual pattern in the residual plot. Data points for the predictor and response variables are independent. Conditions are valid for creating a linear regression model. The fitted equation for this model is:
\[\hat{Employment} = 31.8551 + 2.9900 * Education\]
The model itself is statistically significant from the p-value of its F-test, 9.456e-07 < \(\alpha\) = 0.05; however, \(R^2\) shows that only 31.92% of the variation in employment is explained by the linear association with education in the linear regression model. This regression could be improved by adding more predictor parameters in a multiple regression.
multiple_employment <- lm(employment ~ discrimination_in_the_family + restricted_civil_liberties + absence_of_legal_discrimination, data = data)
summary(multiple_employment)
##
## Call:
## lm(formula = employment ~ discrimination_in_the_family + restricted_civil_liberties +
## absence_of_legal_discrimination, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.9510 -4.4503 -0.5388 4.7802 22.2083
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.03642 17.14084 0.761 0.449857
## discrimination_in_the_family -0.37606 0.08439 -4.456 3.63e-05 ***
## restricted_civil_liberties 0.39578 0.11097 3.566 0.000711 ***
## absence_of_legal_discrimination 0.60455 0.17076 3.540 0.000772 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.508 on 61 degrees of freedom
## Multiple R-squared: 0.6548, Adjusted R-squared: 0.6379
## F-statistic: 38.58 on 3 and 61 DF, p-value: 4.18e-14
Where DIF = discrimination_in_the_family, RCL = restricted_civil_liberties, and absence_of_legal_discrimination = ALD:
\[\hat{Employment} = 13.03642 - 0.37606 * DIF + 0.39578 * RCL + 0.60455 * ALD\]
The multiple regression with this combination of predictor variables results in a statistically significant model with statistically significant parameters. The \(R^2_{adj}\) = 0.6379 on account for adding the new predictors. This model explains an additional 31.87% variation in employment compared to the simple linear regression. The simple linear regression only took into account education, but this model focusing on the presence / absence of liberties and discrimination appears to be a better model of prediction.
ggplot(data, aes(x = discrimination_in_the_family + restricted_civil_liberties + absence_of_legal_discrimination, y = employment)) + geom_point() + geom_smooth(method = "lm", se = FALSE) + labs(title = "Employment multiple regression")
## `geom_smooth()` using formula = 'y ~ x'
e <- resid(multiple_employment)
qqnorm(e)
qqline(e, col = "blue")
plot(fitted(multiple_employment), e)
abline(h = 0, col = "blue")
boxcox(multiple_employment)
Education vs Employment Plot by Continent:
ggplot(data, aes(x = education, y = employment, color = Continent)) + geom_point() + geom_smooth(method = "lm", se = FALSE) + labs(title = "Education vs Employment by Country")
## `geom_smooth()` using formula = 'y ~ x'
Seperate Boxplots:
ggplot(data, aes(x = Continent, y = education)) + geom_boxplot() + labs(title = "Women's Education by Continent", y = "Education")
ggplot(data, aes(x = Continent, y = access_to_productive_and_financial_assets)) + geom_boxplot() + labs(title = "Women's Access to Productive and Financial Assets by Continent", y = "Access to Productive and Financial Assets")
Combined Boxplots:
boxplot <- ggplot(data) +
geom_boxplot(aes(x = Continent, y = education, fill = "education")) +
geom_boxplot(aes(x = Continent, y = access_to_productive_and_financial_assets, fill = "Access to Productive and Financial Assets")) +
labs(title = "Boxplots of Education and Access to Productive and Financial Assets", y = "Education and Access to Productive and Financial Assets", x = "Continent")
ggplotly(boxplot) %>% layout(boxmode = "group")
## Warning: 'layout' objects don't have these attributes: 'boxmode'
## Valid attributes include:
## '_deprecated', 'activeshape', 'annotations', 'autosize', 'autotypenumbers', 'calendar', 'clickmode', 'coloraxis', 'colorscale', 'colorway', 'computed', 'datarevision', 'dragmode', 'editrevision', 'editType', 'font', 'geo', 'grid', 'height', 'hidesources', 'hoverdistance', 'hoverlabel', 'hovermode', 'images', 'legend', 'mapbox', 'margin', 'meta', 'metasrc', 'modebar', 'newshape', 'paper_bgcolor', 'plot_bgcolor', 'polar', 'scene', 'selectdirection', 'selectionrevision', 'separators', 'shapes', 'showlegend', 'sliders', 'smith', 'spikedistance', 'template', 'ternary', 'title', 'transition', 'uirevision', 'uniformtext', 'updatemenus', 'width', 'xaxis', 'yaxis', 'barmode', 'bargap', 'mapType'
Education vs Access by Continent:
ggplot(data, aes(x = education, y = access_to_productive_and_financial_assets, color = Continent)) + geom_point() + geom_smooth(method = "lm", se = FALSE) + labs(title = "Education vs Access to Productive and Financial Assets")
## `geom_smooth()` using formula = 'y ~ x'